Importing Packages¶

In [2]:
!pip install xlrd
!pip install imblearn
Defaulting to user installation because normal site-packages is not writeable
Requirement already satisfied: xlrd in /home/medhat/.local/lib/python3.10/site-packages (2.0.1)
Defaulting to user installation because normal site-packages is not writeable
Requirement already satisfied: imblearn in /home/medhat/.local/lib/python3.10/site-packages (0.0)
Requirement already satisfied: imbalanced-learn in /home/medhat/.local/lib/python3.10/site-packages (from imblearn) (0.10.1)
Requirement already satisfied: scipy>=1.3.2 in /home/medhat/.local/lib/python3.10/site-packages (from imbalanced-learn->imblearn) (1.9.0)
Requirement already satisfied: numpy>=1.17.3 in /home/medhat/.local/lib/python3.10/site-packages (from imbalanced-learn->imblearn) (1.23.5)
Requirement already satisfied: scikit-learn>=1.0.2 in /home/medhat/.local/lib/python3.10/site-packages (from imbalanced-learn->imblearn) (1.1.2)
Requirement already satisfied: threadpoolctl>=2.0.0 in /home/medhat/.local/lib/python3.10/site-packages (from imbalanced-learn->imblearn) (3.1.0)
Requirement already satisfied: joblib>=1.1.1 in /home/medhat/.local/lib/python3.10/site-packages (from imbalanced-learn->imblearn) (1.2.0)
In [3]:
# dataframe package
import pandas as pd
import numpy as np
from pandasql import sqldf

# Plotting Packages
import matplotlib.pyplot as plt
%matplotlib inline

# Viz Packages
import holoviews as hv
from holoviews import opts, dim
from pandas.plotting import parallel_coordinates

# ML Packages
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, ConfusionMatrixDisplay, confusion_matrix, roc_auc_score

# XGBoost model
from xgboost import XGBClassifier

# Oversampling package
from imblearn.over_sampling import SMOTE

hv.extension('bokeh')
In [4]:
clients = pd.read_excel("defaultofcreditcardclients.xls", header=1, index_col="ID")

I'll use the index column in the dataset and ignore the first row.

In [5]:
pd.set_option('display.max_columns', 30)

EDA¶

In [6]:
clients.head()
Out[6]:
LIMIT_BAL SEX EDUCATION MARRIAGE AGE PAY_0 PAY_2 PAY_3 PAY_4 PAY_5 PAY_6 BILL_AMT1 BILL_AMT2 BILL_AMT3 BILL_AMT4 BILL_AMT5 BILL_AMT6 PAY_AMT1 PAY_AMT2 PAY_AMT3 PAY_AMT4 PAY_AMT5 PAY_AMT6 default payment next month
ID
1 20000 female university married 24 2 2 -1 -1 -2 -2 3913 3102 689 0 0 0 0 689 0 0 0 0 1
2 120000 female university single 26 -1 2 0 0 0 2 2682 1725 2682 3272 3455 3261 0 1000 1000 1000 0 2000 1
3 90000 female university single 34 0 0 0 0 0 0 29239 14027 13559 14331 14948 15549 1518 1500 1000 1000 1000 5000 0
4 50000 female university married 37 0 0 0 0 0 0 46990 48233 49291 28314 28959 29547 2000 2019 1200 1100 1069 1000 0
5 50000 male university married 57 -1 0 -1 0 0 0 8617 5670 35835 20940 19146 19131 2000 36681 10000 9000 689 679 0
In [7]:
clients.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 30000 entries, 1 to 30000
Data columns (total 24 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   LIMIT_BAL                   30000 non-null  int64 
 1   SEX                         30000 non-null  object
 2   EDUCATION                   29669 non-null  object
 3   MARRIAGE                    29677 non-null  object
 4   AGE                         30000 non-null  int64 
 5   PAY_0                       30000 non-null  int64 
 6   PAY_2                       30000 non-null  int64 
 7   PAY_3                       30000 non-null  int64 
 8   PAY_4                       30000 non-null  int64 
 9   PAY_5                       30000 non-null  int64 
 10  PAY_6                       30000 non-null  int64 
 11  BILL_AMT1                   30000 non-null  int64 
 12  BILL_AMT2                   30000 non-null  int64 
 13  BILL_AMT3                   30000 non-null  int64 
 14  BILL_AMT4                   30000 non-null  int64 
 15  BILL_AMT5                   30000 non-null  int64 
 16  BILL_AMT6                   30000 non-null  int64 
 17  PAY_AMT1                    30000 non-null  int64 
 18  PAY_AMT2                    30000 non-null  int64 
 19  PAY_AMT3                    30000 non-null  int64 
 20  PAY_AMT4                    30000 non-null  int64 
 21  PAY_AMT5                    30000 non-null  int64 
 22  PAY_AMT6                    30000 non-null  int64 
 23  default payment next month  30000 non-null  int64 
dtypes: int64(21), object(3)
memory usage: 5.7+ MB

NULL values at columns: EDUCATION, MARRIAGE

In [8]:
clients.rename(columns={"PAY_0":"PAY_1", "default payment next month": "DEFAULTS"}, inplace=True)

Renamed columns PAY_0 and the target column for a more consistent names and easiness

In [9]:
clients["PAY_1"].value_counts()
Out[9]:
 0    14737
-1     5686
 1     3688
-2     2759
 2     2667
 3      322
 4       76
 5       26
 8       19
 6       11
 7        9
Name: PAY_1, dtype: int64
In [10]:
clients.hist(bins=50, figsize=(25,25), xrot=45)
plt.show()

A lot of outliers and higly skwed data. Another task to do during the data processing

Let's do more visualization

In [11]:
def stacked_bars_plot(data, attrs):
    if len(attrs) < 4:
        rows, cols = 1, len(attrs)
    else:
        rows = int(np.ceil(len(attrs) / 3))
        cols = 3
    fig, axs = plt.subplots(rows, cols, figsize = (25, rows * 10))
    
    for column, ax in zip(attrs, axs.flatten()):
        df = data.groupby('DEFAULTS')[column].value_counts(normalize=True).unstack(column)
        df.plot.bar(stacked=True, ax=ax, title=f"Normalized Bar Chart for {column}")
    
    plt.show()
In [12]:
cat_attrs = ["SEX", "EDUCATION", "MARRIAGE"]
stacked_bars_plot(clients, cat_attrs)

The distribution of these features seems similar, which suggets that there's no corrrelation between the demographs and whether someone will defaults or not.

In [13]:
pay_attrs = ["PAY_1", "PAY_2", "PAY_3", "PAY_4", "PAY_5", "PAY_6"]
stacked_bars_plot(clients, pay_attrs)    
In [14]:
clients.corr(numeric_only=True)["DEFAULTS"].sort_values(ascending=False)
Out[14]:
DEFAULTS     1.000000
PAY_1        0.324794
PAY_2        0.263551
PAY_3        0.235253
PAY_4        0.216614
PAY_5        0.204149
PAY_6        0.186866
AGE          0.013259
BILL_AMT6   -0.005372
BILL_AMT5   -0.006760
BILL_AMT4   -0.010156
BILL_AMT3   -0.014076
BILL_AMT2   -0.014193
BILL_AMT1   -0.019644
PAY_AMT6    -0.053183
PAY_AMT5    -0.055124
PAY_AMT3    -0.056250
PAY_AMT4    -0.056827
PAY_AMT2    -0.058579
PAY_AMT1    -0.072929
LIMIT_BAL   -0.153520
Name: DEFAULTS, dtype: float64

Data Visualization¶

In [15]:
chord_df = sqldf(
"""SELECT DISTINCT PAY_2,
                   PAY_1,
                   COUNT(PAY_1) AS COUNT
    FROM clients 
    GROUP BY PAY_2,
             PAY_1
;
""", globals())
In [27]:
chord = hv.Chord(chord_df, kdims=["PAY_2", "PAY_1"], vdims=["COUNT"])

chord.opts(opts.Chord(width=800, 
                      height=800,
                      labels=dim('index'), 
                      cmap='Category20', 
                      edge_color=dim('PAY_2'),
                      node_color= dim('index'),
                      title="The Payment Delay from August to September"
                     )
          )
Out[27]:

In the above chord diagram, I drew the change of the status of the clients between August and September. The chord diagram gives us an indication about how the clients behave. Most of the clients who payed minium on August did the same thing on September. Other insights can be deduced as well from this diagram.

Data Processing¶

Now, it's time to clean the data. We have two features with missing data

In [498]:
clients.EDUCATION.value_counts(dropna=False)
Out[498]:
university         14030
graduate school    10585
high school         4916
NaN                  331
others               124
0                     14
Name: EDUCATION, dtype: int64
In [499]:
clients.MARRIAGE.value_counts(dropna=False)
Out[499]:
single     15964
married    13659
NaN          323
0             54
Name: MARRIAGE, dtype: int64
In [500]:
clients.EDUCATION.replace({0: np.nan}, inplace=True)
clients.MARRIAGE.replace({0: np.nan}, inplace=True)
In [501]:
clients.EDUCATION.value_counts(dropna=False)
Out[501]:
university         14030
graduate school    10585
high school         4916
NaN                  345
others               124
Name: EDUCATION, dtype: int64
In [502]:
#instantiate both packages to use
encoder = OrdinalEncoder()
imputer = KNNImputer()

def encode(column):
    '''function to encode non-null data and replace it in the original data'''
    #retains only non-null values
    nonulls = np.array(column.dropna())
    
    #reshapes the data for encoding
    impute_reshape = nonulls.reshape(-1,1)
    
    #encode date
    impute_ordinal = encoder.fit_transform(impute_reshape)
    
    #Assign back encoded values to non-null values
    column.loc[column.notnull()] = np.squeeze(impute_ordinal)
    return column

#create a for loop to iterate through each column in the data
for column in cat_attrs:
    clients[column] = encode(clients[column].copy())
In [503]:
%%time
# impute data and convert 
encode_data = pd.DataFrame(np.round(imputer.fit_transform(clients)),
                           columns = clients.columns)
CPU times: user 4.74 s, sys: 15.1 s, total: 19.8 s
Wall time: 2.49 s
In [505]:
encode_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 24 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   LIMIT_BAL  30000 non-null  float64
 1   SEX        30000 non-null  float64
 2   EDUCATION  30000 non-null  float64
 3   MARRIAGE   30000 non-null  float64
 4   AGE        30000 non-null  float64
 5   PAY_1      30000 non-null  float64
 6   PAY_2      30000 non-null  float64
 7   PAY_3      30000 non-null  float64
 8   PAY_4      30000 non-null  float64
 9   PAY_5      30000 non-null  float64
 10  PAY_6      30000 non-null  float64
 11  BILL_AMT1  30000 non-null  float64
 12  BILL_AMT2  30000 non-null  float64
 13  BILL_AMT3  30000 non-null  float64
 14  BILL_AMT4  30000 non-null  float64
 15  BILL_AMT5  30000 non-null  float64
 16  BILL_AMT6  30000 non-null  float64
 17  PAY_AMT1   30000 non-null  float64
 18  PAY_AMT2   30000 non-null  float64
 19  PAY_AMT3   30000 non-null  float64
 20  PAY_AMT4   30000 non-null  float64
 21  PAY_AMT5   30000 non-null  float64
 22  PAY_AMT6   30000 non-null  float64
 23  DEFAULTS   30000 non-null  float64
dtypes: float64(24)
memory usage: 5.5 MB

Some values in the age column were unresonable, some ages were above 140

In [506]:
invalid_ages = encode_data[encode_data["AGE"] > 100].index
invalid_ages
Out[506]:
Int64Index([4011, 4116, 5395, 6963, 7318, 8940, 29496], dtype='int64')
In [507]:
encode_data.loc[invalid_ages, "AGE"] = 100
In [508]:
encode_data[encode_data["AGE"] > 100].index
Out[508]:
Int64Index([], dtype='int64')

Now that we have cleaned our data, we have to fix the issue of imbalanced data. I'll use SMOTE to fix this issue

In [509]:
encode_data["DEFAULTS"].hist()
plt.show()
In [510]:
X, y = SMOTE().fit_resample(encode_data.drop("DEFAULTS", axis=1), encode_data["DEFAULTS"])
In [521]:
y.hist()
plt.show()

Now our two classes are balanced

Classification¶

In [512]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

1. Randome Forest Model¶

1.1 Default Hyperparameters¶

In [513]:
%%time
forest = RandomForestClassifier()
forest.fit(X_train, y_train)
CPU times: user 5.59 s, sys: 4.07 ms, total: 5.6 s
Wall time: 5.6 s
Out[513]:
RandomForestClassifier()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier()
In [514]:
forest_y_pred = forest.predict(X_test)
In [515]:
print(classification_report(y_test, forest_y_pred))
              precision    recall  f1-score   support

         0.0       0.85      0.93      0.89      7698
         1.0       0.92      0.84      0.88      7723

    accuracy                           0.88     15421
   macro avg       0.88      0.88      0.88     15421
weighted avg       0.88      0.88      0.88     15421

In [516]:
cm = confusion_matrix(y_test, forest_y_pred)
ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=forest.classes_).plot()
plt.show()

1.2 Hyperparameter Tuning¶

In [398]:
params = {'max_depth': [10, 50,100],
          'n_estimators': [100, 500, 1000]
         }
In [399]:
%%time
forest_tuned = GridSearchCV(estimator=model, 
                           param_grid=params,
                           scoring='recall', 
                           verbose=0)
forest_tuned.fit(X_train, y_train)
CPU times: user 16min 13s, sys: 1.01 s, total: 16min 14s
Wall time: 16min 14s
Out[399]:
GridSearchCV(estimator=RandomForestClassifier(),
             param_grid={'max_depth': [10, 50, 100],
                         'n_estimators': [100, 500, 1000]},
             scoring='recall')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GridSearchCV(estimator=RandomForestClassifier(),
             param_grid={'max_depth': [10, 50, 100],
                         'n_estimators': [100, 500, 1000]},
             scoring='recall')
RandomForestClassifier()
RandomForestClassifier()
In [400]:
forest_tuned_y_pred = forest_tuned.predict(X_test)
In [406]:
print(classification_report(y_test, forest_tuned_y_pred))
              precision    recall  f1-score   support

         0.0       0.85      0.92      0.88      7691
         1.0       0.92      0.84      0.87      7730

    accuracy                           0.88     15421
   macro avg       0.88      0.88      0.88     15421
weighted avg       0.88      0.88      0.88     15421

In [407]:
cm = confusion_matrix(y_test, forest_tuned_y_pred)
ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=forest_tuned.classes_).plot()
plt.show()

2. XGBoost Model¶

2.1 Default Hyperparameters¶

In [517]:
%%timeit
xgmodel = XGBClassifier()
xgmodel.fit(X_train, y_train)
1.15 s ± 142 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
In [518]:
xg_y_pred = xgmodel.predict(X_test)
In [519]:
print(classification_report(y_test, xg_y_pred))
              precision    recall  f1-score   support

         0.0       0.86      0.96      0.91      7698
         1.0       0.96      0.85      0.90      7723

    accuracy                           0.90     15421
   macro avg       0.91      0.90      0.90     15421
weighted avg       0.91      0.90      0.90     15421

In [520]:
cm = confusion_matrix(y_test, xg_y_pred)
ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=xgmodel.classes_).plot()
plt.show()

2.2 Hyperparameter Tuning¶

In [401]:
params = { 'max_depth': [3,6,10],
          'learning_rate': [0.01, 0.08, 0.1],
          'n_estimators': [100, 500, 1000],
          'colsample_bytree': [0.3, 0.7, 1]
         }
In [402]:
%%time
xg_tuned = GridSearchCV(estimator=xgmodel, 
                   param_grid=params,
                   scoring='recall', 
                   verbose=1)
xg_tuned.fit(X_train, y_train)
Fitting 5 folds for each of 81 candidates, totalling 405 fits
CPU times: user 7h 14min 59s, sys: 1min 23s, total: 7h 16min 22s
Wall time: 27min 28s
Out[402]:
GridSearchCV(estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     callbacks=None, colsample_bylevel=1,
                                     colsample_bynode=1, colsample_bytree=1,
                                     early_stopping_rounds=None,
                                     enable_categorical=False, eval_metric=None,
                                     feature_types=None, gamma=0, gpu_id=-1,
                                     grow_policy='depthwise',
                                     importance_type=None,
                                     interaction_constraints='',
                                     learning_rate=0.300000012, max_...
                                     max_cat_threshold=64, max_cat_to_onehot=4,
                                     max_delta_step=0, max_depth=6,
                                     max_leaves=0, min_child_weight=1,
                                     missing=nan, monotone_constraints='()',
                                     n_estimators=100, n_jobs=0,
                                     num_parallel_tree=1, predictor='auto',
                                     random_state=0, ...),
             param_grid={'colsample_bytree': [0.3, 0.7, 1],
                         'learning_rate': [0.01, 0.08, 0.1],
                         'max_depth': [3, 6, 10],
                         'n_estimators': [100, 500, 1000]},
             scoring='recall', verbose=1)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GridSearchCV(estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     callbacks=None, colsample_bylevel=1,
                                     colsample_bynode=1, colsample_bytree=1,
                                     early_stopping_rounds=None,
                                     enable_categorical=False, eval_metric=None,
                                     feature_types=None, gamma=0, gpu_id=-1,
                                     grow_policy='depthwise',
                                     importance_type=None,
                                     interaction_constraints='',
                                     learning_rate=0.300000012, max_...
                                     max_cat_threshold=64, max_cat_to_onehot=4,
                                     max_delta_step=0, max_depth=6,
                                     max_leaves=0, min_child_weight=1,
                                     missing=nan, monotone_constraints='()',
                                     n_estimators=100, n_jobs=0,
                                     num_parallel_tree=1, predictor='auto',
                                     random_state=0, ...),
             param_grid={'colsample_bytree': [0.3, 0.7, 1],
                         'learning_rate': [0.01, 0.08, 0.1],
                         'max_depth': [3, 6, 10],
                         'n_estimators': [100, 500, 1000]},
             scoring='recall', verbose=1)
XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, feature_types=None, gamma=0, gpu_id=-1,
              grow_policy='depthwise', importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_bin=256, max_cat_threshold=64, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0, ...)
XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, feature_types=None, gamma=0, gpu_id=-1,
              grow_policy='depthwise', importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_bin=256, max_cat_threshold=64, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0, ...)
In [403]:
xg_tuned_y_pred = xg_tuned.predict(X_test)
In [404]:
print(classification_report(y_test, xg_tuned_y_pred))
              precision    recall  f1-score   support

         0.0       0.84      0.93      0.88      7691
         1.0       0.92      0.83      0.87      7730

    accuracy                           0.88     15421
   macro avg       0.88      0.88      0.88     15421
weighted avg       0.88      0.88      0.88     15421

In [405]:
cm = confusion_matrix(y_test, xg_tuned_y_pred)
ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=xg_tuned.classes_).plot()
plt.show()

Conclusion¶

We can see that our xgmodel has the best performance and with the default parameters. Depending on our business case and the required accuracy, we can decide if furthur tuning should be done or not. Also, we might consider other model like ANN or Bayesian Inference.

In [ ]: